knitr::opts_chunk$set(echo = TRUE)
This file records the calculation metrics for the manuscript "Reliable biodiversity metrics from co-occurence based post-clustering curation of amplicon data".
For each table the following metrics are calculated:
(a) Linear regression of OTU richness vs Plant richness for the 130 samples (including r^2^ value),
(b) Number of OTUs,
(c) Taxonomic redundancy,
(d) Betadiversity (species/OTU turnover between sites), and
(e) Community dissimilarity at genus level between plant inventory and taxonomic composition of OTUs.
This step should be carried out after the LULU curation of the OTU tables documented in the file: E_Taxonomic_filtering.Rmd
NB: All markdown chuncks are set to "eval=FALSE". Change these accordingly. Also code blocks to be run outside R, has been #'ed out. Change this accordingly.
Make sure that you have the following bioinformatic tools in your PATH
R packages: stringr, dplyr, tidyr
This step is dependent on the presence of OTU tables (un-curated tables and the corresponding tables curated with LULU)
Setting directories and libraries etc
setwd("~/analyses") main_path <- getwd() path <- file.path(main_path, "otutables_processing") library(stringr) library(dplyr) library(tidyr) require(vegan)
For each of the OTU tables: Calculating the OTU richness plot wise. For each method/table calculate taxonomic redundancy, total OTU count, Number of unique taxonomic names, Number of taxonomic names which are also present in the observational data
allFiles <- list.files(path) all_plTabs <- allFiles[grepl("planttable$", allFiles)] all_prTabs <- allFiles[grepl("planttable.luluprocessed$", allFiles)] all_Tabs <- c(all_plTabs,all_prTabs) read_tabs <- file.path(path, all_Tabs) # Vector for filtering, etc. at this step redundant, but included for safety samples <- c("S001","S002","S003","S004","S005","S006","S007","S008","S067", "S009","S010","S011","S012","S013","S014","S040","S068","S015", "S016","S017","S018","S069","S070","S019","S020","S021","S022", "S024","S025","S026","S027","S041","S028","S029","S030","S032", "S033","S034","S035","S042","S036","S037","S038","S039","S086", "S087","S088","S089","S044","S071","S045","S046","S047","S048", "S049","S050","S051","S052","S053","S055","S056","S057","S058", "S090","S059","S060","S061","S062","S063","S064","S065","S066", "S072","S073","S074","S075","S076","S077","S078","S091","S079", "S080","S081","S082","S083","S084","S085","S092","S094","S095", "S096","S097","S098","S099","S100","S101","S102","S103","S104", "S106","S107","S108","S109","S133","S110","S111","S112","S113", "S114","S115","S116","S117","S118","S119","S120","S121","S122", "S123","S124","S134","S125","S126","S127","S129","S130","S131", "S132","S135","S136","S137") tab_name <- file.path(main_path,"Table_otu_taxonomy_plant_levels.txt") otutaxonomy <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE) tab_name <- file.path(main_path,"Table_plants_2014_cleaned.txt") Plant_data2014 <- read.table(tab_name, sep="\t", row.names = 1, header=TRUE, as.is=TRUE) Plant_richness <- colSums(Plant_data2014) otu_richness <- data.frame(matrix(NA, nrow = 130, ncol = length(all_Tabs))) names(otu_richness) <- all_Tabs rel_redundancy <- vector() total_otu <- vector() mean_pident <- vector() corcoeffs <- vector() betadiversity <- vector() ##inserted lm_intercept <- vector() lm_slope <- vector() read_sum <- vector() Num_otu_taxa_method <- vector() otu_taxa_method <- list() singleton_share <- vector() doubleton_share <- vector() ab_diss <- list() pa_diss <- list() ##inserted until here for(i in seq_along(read_tabs)) { tab <- read.csv(read_tabs[i],sep='\t',header=T,as.is=TRUE,row.names = 1) #read table tab <- tab[,samples] # order samples otu_richness[,i] = colSums(tab>0) # calculate plot wise richness amp_index <- row.names(tab) #OTU id's of current table reftaxindex <- which(otutaxonomy$qseqid %in% amp_index) # index of which OTUs are present in the current table ##inserted perfect_match_index <- which(otutaxonomy$pident == 100 & otutaxonomy$qseqid %in% amp_index) # index of which OTUs are present in the current otu_taxa_method[[i]] <- names(table(otutaxonomy$species[perfect_match_index])) #Which species names have been identified in the current table Num_otu_taxa_method[i] <- length(otu_taxa_method[[i]]) # Number of plant species names ## until here mean_pident[[i]] <- mean(otutaxonomy$pident[reftaxindex]) # average genbank match % spec <- otutaxonomy$species[reftaxindex] # names of all OTUs redundancy <- sum((table(spec) -1)) # count of taxonomically redundant OTUs total_otu[i] <- nrow(tab) #total number of OTUs present in the table betadiversity[i] <- total_otu[i]/mean(otu_richness[,i]) rel_redundancy[i] <- redundancy/total_otu[i] # relative redundancy # R^2 of linear regression of OTU richness vs plant richness corcoeffs[i] <- (cor(Plant_richness,otu_richness[,i]))^2 lm_fit <- lm(otu_richness[,i]~ Plant_richness) lm_intercept[i] <- lm_fit$coefficients[1] lm_slope[i] <- lm_fit$coefficients[2] read_sum[i] <- sum(tab) #Inserted. community dissimilarity stable <- tab trans_table <- t(stable) rowindex <- rowSums(trans_table) != 0 trans_table <- trans_table[rowindex,] stand_table <- decostand(trans_table, "hellinger") ab_diss[[i]] <- vegdist(stand_table, method="bray", binary=FALSE) pa_diss[[i]] <- vegdist(stand_table, method="bray", binary=TRUE) #inserted until here #inserted tab2 <- tab tab2[tab2>1] <- 1 singleton_share[i] <- sum(rowSums(tab2)==1)/total_otu[i] doubleton_share[i] <- sum(rowSums(tab2)==2)/total_otu[i] } ## Inserted #MANTEL test for curation effect on both presence absence (pa) tables and abundance tables (ab) p_table <- Plant_data2014 names(p_table) <- samples trans_p_table <- t(p_table) rowindex <- rowSums(trans_p_table) != 0 trans_p_table <- trans_p_table[rowindex,] plant_pa_diss <- vegdist(trans_p_table, method="bray", binary=TRUE) site_names <- names(plant_table) pa_curation <- list() # Manteltest for curation effect on pa data ab_curation <- list() # Manteltest for curation effect on ab data pa_statistic <- vector() # Mantel statistic r (pa) pa_signif <- vector() # significance level (pa) ab_statistic <- vector() # Mantel statistic r (ab) ab_signif <- vector() # significance level (ab) for(i in 1:(length(read_tabs)/2)) { pa_curation[[i]] <- mantel(pa_diss[[i]], pa_diss[[i+20]], method="pearson", permutations=999) pa_statistic[i] <- pa_curation[[i]]$statistic pa_signif[i] <- pa_curation[[i]]$signif ab_curation[[i]] <- mantel(ab_diss[[i]], ab_diss[[i+20]], method="pearson", permutations=999) ab_statistic[i] <- ab_curation[[i]]$statistic ab_signif[i] <- ab_curation[[i]]$signif } #MANTEL test for correlation with plant data on both presence absence (pa) tables and abundance tables (ab) pa_vs_plant <- list() # Manteltest for plant data vd sequence data (pa) ab_vs_plant <- list() # Manteltest for plant data vd sequence data (ab) pa_vs_plant_statistic <- vector() # Mantel statistic r (pa) pa_vs_plant_signif <- vector() # significance level (pa) ab_vs_plant_statistic <- vector() # Mantel statistic r (ab) ab_vs_plant_signif <- vector() # significance level (ab) for(i in 1:(length(read_tabs))) { pa_vs_plant[[i]] <- mantel(plant_pa_diss, pa_diss[[i]], method="pearson", permutations=999) pa_vs_plant_statistic[i] <- pa_vs_plant[[i]]$statistic pa_vs_plant_signif[i] <- pa_vs_plant[[i]]$signif ab_vs_plant[[i]] <- mantel(plant_pa_diss, ab_diss[[i]], method="pearson", permutations=999) ab_vs_plant_statistic[i] <- ab_vs_plant[[i]]$statistic ab_vs_plant_signif[i] <- ab_vs_plant[[i]]$signif } ##Inserted until here #Synchronize names for methods, levels and curation state and # collect table statistics in one table method <- str_split_fixed(all_Tabs, "_", 3)[,1] method[method == "DADA2"] <- "DADA2(+VS)" method[method == "DADA2VSEARCH"] <- "DADA2(+VS)" level <- str_split_fixed(all_Tabs, "_", 3)[,2] level <- gsub(".planttable","",level) level[level == "0.95"] <- "95" level[level == "0.96"] <- "96" level[level == "0.97"] <- "97" level[level == "0.98"] <- "98" level[level == "0.985"] <- "98.5" level[level == "NO"] <- "99/100" level[level == "3"] <- "99/100" level[level == "5"] <- "98.5" level[level == "7"] <- "98" level[level == "10"] <- "97" level[level == "13"] <- "96" level[level == "15"] <- "95" level <- factor(level,levels = c("99/100", "98.5", "98", "97", "96", "95")) #identify LULU curated tables processed <- str_split_fixed(all_Tabs, "_", 3)[,3] luluindex <- which(processed == "luluprocessed") processed[luluindex] <- "curated" processed[-luluindex] <- "raw" #Merge all results in one table method_statistics <- data.frame(Method=method,Level=level, Curated=processed,Correlation=corcoeffs, Redundancy=rel_redundancy,OTU_count=total_otu, Mean_match=mean_pident,Beta=betadiversity, Intercept = lm_intercept, Slope=lm_slope, Total_readcount = read_sum, Taxa=Num_otu_taxa_method, Singleton=singleton_share,Doubleton=doubleton_share, Com_dissim_PA_stat=pa_vs_plant_statistic, Com_dissim_PA_sig=pa_vs_plant_signif, Com_dissim_AB_stat=ab_vs_plant_statistic, Com_dissim_AB_sig=ab_vs_plant_signif) tab_name <- file.path(main_path,"Table_method_statistics_updated_revision1.txt") {write.table(method_statistics, tab_name, sep="\t",quote=FALSE, col.names = NA)} #Merge manteltests on curation effect in one table mantel_tests_curation_effet <- data.frame(Method=method[1:20], Level=level[1:20], PA_curation = pa_statistic, PA_significance=pa_signif, AB_curation=ab_statistic, AB_significance=ab_signif) tab_name <- file.path(main_path,"Table_manteltests.txt") {write.table(mantel_tests_curation_effet, tab_name, sep="\t",quote=FALSE, col.names = NA)}
Construct a full plant richness vs OTU richness table and synchronize names for methods, levels and curation state
# add Plant richness to OTU richness dataframe richness_incl_obs <- cbind(Obs_richness=Plant_richness,otu_richness) total_richness_df <- gather(richness_incl_obs, key=Method, value=OTU_richness,-1) method <- str_split_fixed(total_richness_df$Method, "_", 3)[,1] method[method == "DADA2"] <- "DADA2(+VS)" method[method == "DADA2VSEARCH"] <- "DADA2(+VS)" level <- str_split_fixed(total_richness_df$Method, "_", 3)[,2] level <- gsub(".planttable","",level) level[level == "0.95"] <- "95" level[level == "0.96"] <- "96" level[level == "0.97"] <- "97" level[level == "0.98"] <- "98" level[level == "0.985"] <- "98.5" level[level == "NO"] <- "99/100" level[level == "3"] <- "99/100" level[level == "5"] <- "98.5" level[level == "7"] <- "98" level[level == "10"] <- "97" level[level == "13"] <- "96" level[level == "15"] <- "95" level[level == "100"] <- "99" level <- factor(level,levels = c("99/100", "98.5", "98", "97", "96", "95")) processed <- str_split_fixed(total_richness_df$Method, "_", 3)[,3] luluindex <- which(processed == "luluprocessed") processed[luluindex] <- "curated" processed[-luluindex] <- "raw" total_richness_df2 <- data.frame(Method=method,Level=level,Curated=processed, Obs=total_richness_df$Obs_richness, OTU=total_richness_df$OTU_richness) #save a long formatted table for ggplot tab_name <- file.path(main_path,"Table_richness_calculations_long.txt") {write.table(total_richness_df2, tab_name, sep="\t",quote=FALSE, col.names = NA)} #save a wide formatted table for overview tab_name <- file.path(main_path,"Table_richness_calculations_wide.txt") {write.table(richness_incl_obs, tab_name, sep="\t",quote=FALSE, col.names = NA)}
Construct a table with the best match (%) for each OTU pr method/table separating retained and discarded OTUs.
allFiles <- list.files(path) all_prTabs <- allFiles[grepl("LULU-", allFiles)] prTab_names <- sort(as.vector(sapply(all_prTabs, function(x) strsplit(x, "LULU-")[[1]][2]))) rds <- list() read_tabs <- file.path(path, all_prTabs) tab_name <- file.path(main_path,"Table_otu_taxonomy_plant_levels.txt") otutaxonomy <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE) retained_avg <- vector() discarded_avg <- vector() pident_frame <- data.frame() for(i in seq_along(read_tabs)) { rds <- readRDS(read_tabs[i]) # read the saved rds LULU output retained_amplicons <- rds$cured_OTUs # extract the cured/retained OTUs discarded_amplicons <- rds$discarded_OTUs number_observations <- length(retained_amplicons)+length(discarded_amplicons) retained_index <- which(otutaxonomy$qseqid %in% retained_amplicons) discarded_index <- which(otutaxonomy$qseqid %in% discarded_amplicons) retained_pident <- otutaxonomy$pident[retained_index] discarded_pident <- otutaxonomy$pident[discarded_index] method_pident <- rep(prTab_names[i],number_observations) retained_or_discarded <- c(rep("retained",length(retained_amplicons)), rep("discarded",length(discarded_amplicons))) pident <- c(retained_pident,discarded_pident) current_pident_frame <- data.frame(method_pident,retained_or_discarded,pident) pident_frame <- rbind(pident_frame,current_pident_frame) # retained_avg[i] <- mean(retained_pident) # discarded_avg[i] <- mean(discarded_pident) } #Synchronize names for methods, levels and curation state pident_frame$level_pident <- str_split_fixed(as.character(pident_frame$method_pident), "_", 2)[,2] pident_frame$method_pident <- str_split_fixed(as.character(pident_frame$method_pident), "_", 2)[,1] pident_frame$method_pident[pident_frame$method_pident == "DADA2"] <- "DADA2(+VS)" pident_frame$method_pident[pident_frame$method_pident == "DADA2VSEARCH"] <- "DADA2(+VS)" pident_frame$level_pident[pident_frame$level_pident == "0.95"] <- "95" pident_frame$level_pident[pident_frame$level_pident == "0.96"] <- "96" pident_frame$level_pident[pident_frame$level_pident == "0.97"] <- "97" pident_frame$level_pident[pident_frame$level_pident == "0.98"] <- "98" pident_frame$level_pident[pident_frame$level_pident == "0.985"] <- "98.5" pident_frame$level_pident[pident_frame$level_pident == "NO"] <- "99/100" pident_frame$level_pident[pident_frame$level_pident == "3"] <- "99/100" pident_frame$level_pident[pident_frame$level_pident == "5"] <- "98.5" pident_frame$level_pident[pident_frame$level_pident == "7"] <- "98" pident_frame$level_pident[pident_frame$level_pident == "10"] <- "97" pident_frame$level_pident[pident_frame$level_pident == "13"] <- "96" pident_frame$level_pident[pident_frame$level_pident == "15"] <- "95" pident_frame$level_pident <- factor(pident_frame$level_pident,levels = c("99/100", "98.5", "98", "97", "96", "95")) tab_name <- file.path(main_path,"Table_OTU_match_rates.txt") {write.table(pident_frame, tab_name, sep="\t",quote=FALSE, col.names = NA)}
Taxonomic dissimilarity
Constructing genus level OTU tables. Read counts are reduced to presence/absence, and observations are number of species in respective genera/families.
tab_name <- tbname <- file.path(main_path,"Table_plants_genus_2014_cleaned.txt") genussummed_reference <- read.table(tab_name, sep="\t", row.names = 1, header=TRUE, as.is=TRUE) allFiles <- list.files(path) all_plTabs <- allFiles[grepl("planttable$", allFiles)] all_prTabs <- allFiles[grepl("planttable.luluprocessed$", allFiles)] all_Tabs <- c(all_plTabs,all_prTabs) read_tabs <- file.path(path, all_Tabs) proc_tabs <- file.path(path, paste0(all_Tabs,".for_visual_inspectionX.txt")) tab_name <- tbname <- file.path(main_path,"sample_list.txt") siteinfo <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE) tab_name <- tbname <- file.path(main_path,"Table_otu_taxonomy_plant_levels.txt") otutaxonomy <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE) row.names(otutaxonomy) <- otutaxonomy$qseqid otutab_genussummed <- list() otu_genera <- vector() for(i in seq_along(read_tabs)) { tab <- read.csv(read_tabs[i],sep='\t',header=T,as.is=TRUE,row.names = 1) tab[tab>0] <- 1 tabs <- tab[,siteinfo$Name] tabg <- tab[,siteinfo$Name] names(tabg) <- siteinfo$BW names(tabs) <- siteinfo$BW speciesnames <- otutaxonomy[row.names(tabs),"species"] tabs$species <- speciesnames tabg$genus <- str_split_fixed(speciesnames, "_", 2)[,1] otutab_genussummed[[i]] <- tabg %>% group_by(genus) %>% summarise_each(funs(sum)) otutab_genussummed[[i]] <- data.frame(otutab_genussummed[[i]]) row.names(otutab_genussummed[[i]]) <- otutab_genussummed[[i]]$genus otutab_genussummed[[i]] <- otutab_genussummed[[i]][,-1] otu_genera <- c(otu_genera,tabg$genus) # output OTU tables with species names for visual inspection (remove #) #{write.table(tabs, proc_tabs[i], sep="\t",quote=FALSE, col.names = NA)} } community_distance_table <- data.frame(matrix(0, ncol = length(all_Tabs), nrow = 130, dimnames = list(siteinfo$BW,all_Tabs))) for(i in seq_along(all_Tabs)) { otugenera_clean <- row.names(otutab_genussummed[[i]]) otugenera_clean <- otugenera_clean[otugenera_clean != ""] otugenera_clean <- otugenera_clean[otugenera_clean != "x"] obsgenera_clean <- row.names(genussummed_reference) obsgenera_clean <- obsgenera_clean[obsgenera_clean != ""] obsgenera_clean <- obsgenera_clean[obsgenera_clean != "x"] commongenera <- names(table(c(otugenera_clean,obsgenera_clean))) otuframe <- data.frame(matrix(0, nrow = length(commongenera), ncol = 130, dimnames = list(commongenera,siteinfo$BW))) otuframe[otugenera_clean,] <- otutab_genussummed[[i]][otugenera_clean,] obsframe <- data.frame(matrix(0, nrow = length(commongenera), ncol = 130, dimnames = list(commongenera,siteinfo$BW))) obsframe[obsgenera_clean,siteinfo$BW] <- genussummed_reference[obsgenera_clean,siteinfo$BW] subtracted_frame <- abs(obsframe-otuframe) absolute_distance <- colSums(subtracted_frame) community_distance_table[,i] <- absolute_distance } community_distance_table$BW <- siteinfo$BW gathered_dissimilarity <- gather(community_distance_table, method,dissimilarity,1:40) method <- str_split_fixed(gathered_dissimilarity$method, "_", 3)[,1] method[method == "DADA2"] <- "DADA2(+VS)" method[method == "DADA2VSEARCH"] <- "DADA2(+VS)" level <- str_split_fixed(gathered_dissimilarity$method, "_", 3)[,2] level <- gsub(".planttable","",level) level[level == "0.95"] <- "95" level[level == "0.96"] <- "96" level[level == "0.97"] <- "97" level[level == "0.98"] <- "98" level[level == "0.985"] <- "98.5" level[level == "NO"] <- "99/100" level[level == "3"] <- "99/100" level[level == "5"] <- "98.5" level[level == "7"] <- "98" level[level == "10"] <- "97" level[level == "13"] <- "96" level[level == "15"] <- "95" level <- factor(level,levels = c("99/100", "98.5", "98", "97", "96", "95")) processed <- str_split_fixed(gathered_dissimilarity$method, "_", 3)[,3] luluindex <- which(processed == "luluprocessed") processed[luluindex] <- "curated" processed[-luluindex] <- "raw" gathered_dissimilarity_processed <- data.frame(Method=method,Level=level,Curated=processed, Site=gathered_dissimilarity$BW, Dissimilarity=gathered_dissimilarity$dissimilarity) #save a table for ggplot (long format) tab_name <- file.path(main_path,"Table_taxonomic_dissimilatiry_long.txt") {write.table(gathered_dissimilarity_processed, tab_name, sep="\t", quote=FALSE, row.names = FALSE)} #save a table for overview (wide format) tab_name <- file.path(main_path,"Table_taxonomic_dissimilatiry_wide.txt") {write.table(community_distance_table, tab_name, sep="\t",quote=FALSE, col.names = NA)}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.